# written by Martin Schwentner 06/29/2016
# Retains only the Malacostraca sequences from each input file (all other sequences are dropped) and writes them to new files.
# A second pass then removes the space after '>' in the headers of those output files.

import fileinput, glob, string, sys, os, re

from Bio import SeqIO


# First pass: copy only the Malacostraca records of every "*renamed.fa" file
# into "<input name>_MalacostOnly.fa".

# Eight-character fragments that mark a record ID as one to keep. They are
# matched as plain substrings of record.id, so the spelling must stay exactly
# as it occurs in the input files.
# NOTE(review): "Panaeus_" looks like a misspelling of "Penaeus_" -- presumably
# kept on purpose to catch misspelled IDs; confirm before removing.
MALACOSTRACA_TAGS = (
    "Panaeus_", "Penaeus_", "Enoplome", "HomarusA", "Cherax_d", "Emerita_",
    "Pagurus_", "Libinia_", "Metabeta", "Typhlaty", "Crangon_", "Palaemon",
    "Stenopus", "Neotrype", "Euphausi", "Echinoga", "Caprella", "Cumella_",
    "Idotea_b", "PraunusF", "Leptoche", "Anaspide", "Neogonod", "Speoneba",
    "Panuliru", "Procamba",
)

my_files = glob.glob("*renamed.fa")  # the files that will be opened

for filename in my_files:
    # The with-block closes each output file as soon as its input is done.
    # Previously only the last file was closed, and an empty glob result
    # crashed with NameError on the trailing close().
    with open(filename + '_MalacostOnly.fa', 'w') as out_file:
        for record in SeqIO.parse(filename, "fasta"):  # reads file as fasta file
            if any(tag in record.id for tag in MALACOSTRACA_TAGS):
                # Same bytes the old `print >> out_file` statements produced
                # ("> id" header line, then the sequence line), but written
                # with write() so the script runs on Python 2 and Python 3.
                out_file.write("> %s\n%s\n" % (record.id, record.seq))

# Second pass: for every "*_MalacostOnly.fa" file write a copy named
# "<file name>_edit.fa" in which header lines have "> " replaced by ">".
# Non-header lines are copied unchanged.
my_files_2 = glob.glob("*_MalacostOnly.fa")
for filename2 in my_files_2:
    # Both handles are managed by with-blocks so every output file is flushed
    # and closed before the next one is opened (they were never closed before).
    with open(filename2) as in_file_2:
        with open(filename2 + '_edit.fa', 'w') as out_file_2:
            for lines2 in in_file_2:
                if lines2.startswith('>'):
                    lines2 = lines2.replace('> ', '>')  # here the extra space is removed
                # write() is the correct call for a single string;
                # writelines() only worked by iterating it character by character.
                out_file_2.write(lines2)